# Load necessary libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(lubridate)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Data preparation. Assumes a data frame named 'mvp_data' is already loaded.
# Step 1: impute missing numeric values with each column's median.
mvp_data <- mvp_data %>%
  mutate(across(
    where(is.numeric),
    \(col) ifelse(is.na(col), median(col, na.rm = TRUE), col)
  ))
# Step 2: flag the MVP winner within each season. By convention the player
# with the largest award_share wins; if a season has ties, every tied
# player is flagged TRUE (same as the original max() comparison).
mvp_data <- mvp_data %>%
  mutate(is_mvp = award_share == max(award_share), .by = season)
# Step 3: feature engineering — composites of box-score production,
# shooting efficiency, two-way impact, value metrics, and position.
mvp_data <- mvp_data %>%
  mutate(
    # Points + rebounds + assists per game
    pra = pts_per_g + trb_per_g + ast_per_g,
    # Shooting-efficiency composite (both terms put on a 0-100 scale)
    efficiency = (ts_pct * 100) + efg_pct * 100,
    # Two-way impact: offensive plus defensive box plus/minus
    two_way_score = obpm + dbpm,
    # Value composite: VORP weighted 5x, plus win shares
    value_composite = (vorp * 5) + ws,
    # Team winning percentage on a 0-100 scale
    team_success = win_loss_pct * 100,
    # Share of a standard 82-game season actually played (approximation)
    games_played_pct = g / 82,
    # Numeric 0/1 position flags (as.numeric(...) matches the doubles
    # that ifelse(cond, 1, 0) produced)
    is_guard = as.numeric(pos %in% c("PG", "SG", "G")),
    is_forward = as.numeric(pos %in% c("SF", "PF", "F")),
    is_center = as.numeric(pos %in% c("C"))
  )
# 5. Normalize features to avoid scale issues.
# FIX: exclude `season` from centering/scaling. Season is a chronological
# index, not a model feature (it is absent from the predictor screening
# list below), and scaling it previously forced the train/test split to
# use an opaque standardized cutoff (`season <= 1.22`) instead of the
# actual year stated in the comments.
preproc_params <- preProcess(
  mvp_data %>% select(where(is.numeric), -season),
  method = c("center", "scale")
)
mvp_data_scaled <- predict(preproc_params, mvp_data)
# 6. Time-based features: per-player season-over-season deltas; a player's
# first visible season gets a delta of 0 via the `default =` argument.
# NOTE(review): assumes rows are ordered by season within each player —
# confirm upstream, otherwise lag() picks the wrong prior season.
mvp_data_scaled <- mvp_data_scaled %>%
  group_by(player) %>%
  mutate(
    pts_trend = pts_per_g - lag(pts_per_g, default = first(pts_per_g)),
    ws_trend = ws - lag(ws, default = first(ws))
  ) %>%
  ungroup()
# 7. Chronological split: seasons up to 2018 train, later seasons test.
train_data <- mvp_data_scaled %>% filter(season <= 2018)
test_data <- mvp_data_scaled %>% filter(season > 2018)
# Load necessary libraries
library(tidyverse)
library(caret)
library(glmnet) # For LASSO and Ridge regression
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(xgboost) # For XGBoost
## Warning: package 'xgboost' was built under R version 4.3.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:dplyr':
##
## slice
library(e1071) # For Support Vector Machines
## Warning: package 'e1071' was built under R version 4.3.3
library(keras) # For neural networks
## Warning: package 'keras' was built under R version 4.3.2
library(rpart) # For decision trees
## Warning: package 'rpart' was built under R version 4.3.3
library(kernlab) # For Gaussian Process
## Warning: package 'kernlab' was built under R version 4.3.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
# Model building:
# 1. screen predictors for importance, 2. train several candidate models.
# Without a Random Forest importance step, rank candidates by their
# absolute Pearson correlation with award_share on the training data.
candidate_vars <- c(
  "pts_per_g", "trb_per_g", "ast_per_g", "stl_per_g", "blk_per_g",
  "ts_pct", "ws", "vorp", "bpm", "win_loss_pct", "pra", "efficiency",
  "two_way_score", "value_composite", "team_success", "games_played_pct"
)
correlation_matrix <- cor(train_data[, c("award_share", candidate_vars)])
# Correlation of award_share with each candidate (named by row/column
# rather than position, so reordering columns cannot break it)
award_share_correlations <- correlation_matrix["award_share", candidate_vars]
correlation_df <- data.frame(
  Variable = names(award_share_correlations),
  Correlation = abs(award_share_correlations)
)
correlation_df <- correlation_df[order(-correlation_df$Correlation), ]
# Keep the 10 strongest candidates and build the shared model formula
top_features <- correlation_df$Variable[1:10]
formula_str <- paste("award_share ~", paste(top_features, collapse = " + "))
formula_obj <- as.formula(formula_str)
# Shared 5-fold cross-validation control used by every caret model below
train_control <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
# Design-matrix versions of the data for models fitted on x/y directly
# ([, -1] drops the intercept column that model.matrix() prepends)
x_train <- model.matrix(formula_obj, train_data)[, -1]
y_train <- train_data$award_share
x_test <- model.matrix(formula_obj, test_data)[, -1]
y_test <- test_data$award_share
# 1. Ordinary least squares — the baseline model
lm_model <- train(formula_obj, data = train_data, method = "lm",
                  trControl = train_control)
# LASSO and Ridge share the same fine lambda grid
fine_lambda_grid <- seq(0.001, 0.1, by = 0.001)
# 2. LASSO regression (alpha = 1 -> pure L1 penalty)
lasso_model <- train(
  x = x_train, y = y_train, method = "glmnet", trControl = train_control,
  tuneGrid = expand.grid(alpha = 1, lambda = fine_lambda_grid)
)
# 3. Ridge regression (alpha = 0 -> pure L2 penalty)
ridge_model <- train(
  x = x_train, y = y_train, method = "glmnet", trControl = train_control,
  tuneGrid = expand.grid(alpha = 0, lambda = fine_lambda_grid)
)
# 4. Elastic net: tune the L1/L2 mix, with a coarser lambda grid to keep
# the alpha-by-lambda grid a manageable size
elastic_net_model <- train(
  x = x_train, y = y_train, method = "glmnet", trControl = train_control,
  tuneGrid = expand.grid(
    alpha = seq(0, 1, by = 0.2),
    lambda = seq(0.001, 0.1, by = 0.005)
  )
)
# 5. Gradient boosting machine (caret's default gbm tuning grid)
gbm_model <- train(formula_obj, data = train_data, method = "gbm",
                   trControl = train_control, verbose = FALSE)
# 6. Support vector regression with a radial (RBF) kernel
svr_model <- train(formula_obj, data = train_data, method = "svmRadial",
                   trControl = train_control, tuneLength = 5)
# 7. K-nearest neighbours, tuning k over 1..20
knn_model <- train(formula_obj, data = train_data, method = "knn",
                   trControl = train_control,
                   tuneGrid = expand.grid(k = 1:20))
# Register every fitted model under the display name used in reporting;
# these exact names are matched elsewhere to decide matrix vs. formula
# prediction, so do not rename them.
models <- list(
  LinearRegression = lm_model,
  LASSO = lasso_model,
  Ridge = ridge_model,
  ElasticNet = elastic_net_model,
  GradientBoosting = gbm_model,
  SupportVectorRegression = svr_model,
  KNN = knn_model
)
# Score every model on the held-out seasons. Models fitted on the bare
# design matrix (glmnet family) need x_test; formula-fitted models take
# the test data frame. lapply() replaces the grow-in-a-loop list build.
matrix_models <- c("LASSO", "Ridge", "ElasticNet")
predictions <- lapply(names(models), function(model_name) {
  newdata <- if (model_name %in% matrix_models) x_test else test_data
  predict(models[[model_name]], newdata = newdata)
})
names(predictions) <- names(models)
# Root-mean-squared error of predictions against observed award shares
calculate_rmse <- function(pred, actual) {
  sqrt(mean((pred - actual)^2))
}
# vapply (not sapply) guarantees a named numeric vector regardless of
# input shape, so which.min()/names() below are always safe
model_rmse <- vapply(predictions, calculate_rmse, numeric(1), actual = y_test)
print(model_rmse)
## LinearRegression LASSO Ridge
## 0.7921366 0.7925482 0.8000463
## ElasticNet GradientBoosting SupportVectorRegression
## 0.7925482 0.5350029 0.6325693
## KNN
## 0.4928448
# Winner = lowest test RMSE
best_model_name <- names(which.min(model_rmse))
best_model <- models[[best_model_name]]
cat("Best model:", best_model_name, "with RMSE:", min(model_rmse), "\n")
## Best model: KNN with RMSE: 0.4928448
# Predict award shares on the test set with the winning model
# (matrix-trained models require the design matrix, not the data frame)
if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
  test_data$predicted_award_share <- predict(best_model, newdata = x_test)
} else {
  test_data$predicted_award_share <- predict(best_model, newdata = test_data)
}
# Predicted MVP = top predicted award share within each test season
predicted_mvps <- test_data %>%
  slice_max(predicted_award_share, n = 1, by = season) %>%
  select(season, player, predicted_award_share, award_share, is_mvp)
# Actual MVP = top observed award share within each test season
actual_mvps <- test_data %>%
  slice_max(award_share, n = 1, by = season) %>%
  select(season, player, award_share)
# Line the two up by season and count exact player matches
comparison <- merge(predicted_mvps, actual_mvps, by = "season",
                    suffixes = c("_predicted", "_actual"))
total_seasons <- nrow(comparison)
correct_predictions <- sum(comparison$player_predicted == comparison$player_actual)
mvp_accuracy <- correct_predictions / total_seasons
cat("MVP Prediction Accuracy:", mvp_accuracy * 100, "%\n")
## MVP Prediction Accuracy: 50 %
# Feature importance for the winning model; the extraction mechanism
# depends on the model family.
if (best_model_name == "LinearRegression") {
  # OLS: rank by absolute coefficient magnitude ([-1] drops the intercept)
  coefs <- coef(best_model$finalModel)[-1]
  var_importance <- data.frame(
    Variable = names(coefs),
    Importance = abs(coefs)
  )
  var_importance <- var_importance[order(-var_importance$Importance), ]
} else if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
  # glmnet: coefficients at the cross-validated lambda, intercept dropped
  coefs <- coef(best_model$finalModel, best_model$bestTune$lambda)[, 1][-1]
  var_importance <- data.frame(
    Variable = names(coefs),
    Importance = abs(coefs)
  )
  var_importance <- var_importance[order(-var_importance$Importance), ]
} else if (best_model_name == "GradientBoosting") {
  # gbm: relative influence from the model summary (no plot)
  var_importance <- summary(best_model$finalModel, plotit = FALSE)
  var_importance <- var_importance[order(-var_importance$rel.inf), ]
  names(var_importance)[names(var_importance) == "rel.inf"] <- "Importance"
} else {
  # SVR/KNN expose no native importance measure; fall back to the
  # correlation screening computed earlier.
  var_importance <- correlation_df
  names(var_importance)[names(var_importance) == "Correlation"] <- "Importance"
}
print(var_importance)
## Variable Importance
## vorp vorp 0.46913580
## value_composite value_composite 0.45290418
## ws ws 0.38803353
## pra pra 0.29506484
## pts_per_g pts_per_g 0.29489024
## ast_per_g ast_per_g 0.20408176
## trb_per_g trb_per_g 0.19426497
## bpm bpm 0.19305511
## two_way_score two_way_score 0.19301911
## stl_per_g stl_per_g 0.18957140
## blk_per_g blk_per_g 0.15308848
## team_success team_success 0.13730372
## win_loss_pct win_loss_pct 0.13730372
## games_played_pct games_played_pct 0.09092698
## ts_pct ts_pct 0.08804256
## efficiency efficiency 0.07878740
library(ggplot2)
library(plotly)
# Summarize test RMSE per model, best (lowest) first
model_performance <- data.frame(Model = names(model_rmse), RMSE = model_rmse)
model_performance <- model_performance[order(model_performance$RMSE), ]
# Bar chart of model RMSE; reorder(-RMSE) puts the best model rightmost.
# geom_col() is the modern spelling of geom_bar(stat = "identity").
p1 <- ggplot(model_performance, aes(x = reorder(Model, -RMSE), y = RMSE)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(title = "Model Performance Comparison",
       x = "Model",
       y = "RMSE (lower is better)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p1)
# Horizontal bar chart of the ten most important features for the winner
p2 <- ggplot(head(var_importance, 10),
             aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = paste("Feature Importance -", best_model_name),
       x = "", y = "Importance")
ggplotly(p2)
# Predicted vs. observed award share; the dashed red identity line marks
# perfect prediction
results <- test_data %>%
  transmute(
    Player = player,
    Season = season,
    Actual = award_share,
    Predicted = predicted_award_share
  )
p3 <- ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Actual vs Predicted MVP Award Shares",
       x = "Actual Award Share",
       y = "Predicted Award Share")
ggplotly(p3)
# Ten largest predicted award shares across all test seasons
top_predictions <- test_data %>%
  arrange(desc(predicted_award_share)) %>%
  slice_head(n = 10)
p5 <- ggplot(top_predictions,
             aes(x = reorder(player, predicted_award_share),
                 y = predicted_award_share)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 10 Predicted MVP Candidates",
       x = "", y = "Predicted Award Share")
ggplotly(p5)
# Final console summary: winning model, its held-out RMSE, the MVP hit
# rate across test seasons, and the three top-ranked features from the
# importance table printed above. (The `##` lines are the recorded
# output from the original run.)
cat("Best model:", best_model_name, "\n")
## Best model: KNN
cat("Model RMSE:", round(min(model_rmse), 4), "\n")
## Model RMSE: 0.4928
cat("MVP Prediction Accuracy:", round(mvp_accuracy * 100, 1), "%", "\n")
## MVP Prediction Accuracy: 50 %
cat("Top 3 most important features:", "\n")
## Top 3 most important features:
cat("1.", var_importance$Variable[1], "\n")
## 1. vorp
cat("2.", var_importance$Variable[2], "\n")
## 2. value_composite
cat("3.", var_importance$Variable[3], "\n")
## 3. ws